{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "1041ae6f",
   "metadata": {},
   "source": [
    "![iceberg-logo](https://www.apache.org/logos/res/iceberg/iceberg.png)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "247fb2ab",
   "metadata": {},
   "source": [
    "### [Table Maintenance: The Key To Keeping Your Iceberg Tables Healthy and Performant](https://tabular.io/blog/table-maintenance/)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6a5c8206",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "spark"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1dab5ef0",
   "metadata": {},
   "outputs": [],
   "source": [
    "spark.sql(\"DROP TABLE IF EXISTS demo.nyc.taxis_sample\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "49a45d0b",
   "metadata": {},
   "outputs": [],
   "source": [
    "spark.sql(\"\"\"\n",
    "CREATE TABLE demo.nyc.taxis_sample (\n",
    "  `VendorID` BIGINT,\n",
    "  `tpep_pickup_datetime` TIMESTAMP,\n",
    "  `tpep_dropoff_datetime` TIMESTAMP,\n",
    "  `passenger_count` DOUBLE,\n",
    "  `trip_distance` DOUBLE,\n",
    "  `RatecodeID` DOUBLE,\n",
    "  `store_and_fwd_flag` STRING,\n",
    "  `PULocationID` BIGINT,\n",
    "  `DOLocationID` BIGINT,\n",
    "  `payment_type` BIGINT,\n",
    "  `fare_amount` DOUBLE,\n",
    "  `extra` DOUBLE,\n",
    "  `mta_tax` DOUBLE,\n",
    "  `tip_amount` DOUBLE,\n",
    "  `tolls_amount` DOUBLE,\n",
    "  `improvement_surcharge` DOUBLE,\n",
    "  `total_amount` DOUBLE,\n",
    "  `congestion_surcharge` DOUBLE,\n",
    "  `airport_fee` DOUBLE)\n",
    "USING iceberg\n",
    "TBLPROPERTIES(\n",
    "  'write.target-file-size-bytes'='5242880'\n",
    ")\n",
    "\"\"\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f1f30120",
   "metadata": {},
   "outputs": [],
   "source": [
    "!ls -Shs /home/iceberg/data/yellow_tripdata_*.parquet"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "997bb9df",
   "metadata": {},
   "outputs": [],
   "source": [
    "val df_202201 = spark.read.parquet(\"/home/iceberg/data/yellow_tripdata_2022-01.parquet\")\n",
    "val df_202202 = spark.read.parquet(\"/home/iceberg/data/yellow_tripdata_2022-02.parquet\")\n",
    "val df_202203 = spark.read.parquet(\"/home/iceberg/data/yellow_tripdata_2022-03.parquet\")\n",
    "val df_q1 = df_202201.union(df_202202).union(df_202203)\n",
    "df_q1.write.insertInto(\"nyc.taxis_sample\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "78cab088",
   "metadata": {},
   "source": [
    "## Rewriting Data Files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7ad64e6b",
   "metadata": {},
   "outputs": [],
   "source": [
    "spark.sql(\"SELECT file_path, file_size_in_bytes FROM nyc.taxis_sample.files\").show(100)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d5d10355",
   "metadata": {},
   "outputs": [],
   "source": [
    "spark.sql(\"ALTER TABLE nyc.taxis_sample UNSET TBLPROPERTIES ('write.target-file-size-bytes')\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f26228a5",
   "metadata": {},
   "outputs": [],
   "source": [
    "spark.sql(\"CALL demo.system.rewrite_data_files(table => 'nyc.taxis_sample', options => map('target-file-size-bytes','52428800'))\").show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "43a9ed67",
   "metadata": {},
   "outputs": [],
   "source": [
    "spark.sql(\"SELECT file_path, file_size_in_bytes FROM nyc.taxis_sample.files\").show(100)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "523eb893",
   "metadata": {},
   "source": [
    "## Expiring Snapshots"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a8f21e73",
   "metadata": {},
   "outputs": [],
   "source": [
    "!ls -Shs /home/iceberg/warehouse/nyc/taxis_sample/data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "98e8c5db",
   "metadata": {},
   "outputs": [],
   "source": [
    "spark.sql(\"SELECT committed_at, snapshot_id, operation FROM nyc.taxis_sample.snapshots\").show(truncate=false)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b264c989",
   "metadata": {},
   "outputs": [],
   "source": [
    "val now = java.util.Calendar.getInstance().getTime()\n",
    "val format = new java.text.SimpleDateFormat(\"yyyy-MM-dd HH:mm:ss.SSS\")\n",
    "val now_str = format.format(now)\n",
    "\n",
    "spark.sql(s\"CALL demo.system.expire_snapshots(table => 'nyc.taxis_sample', older_than => TIMESTAMP '$now_str', retain_last => 1)\").show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "131e1f09",
   "metadata": {},
   "outputs": [],
   "source": [
    "spark.sql(\"SELECT committed_at, snapshot_id, operation FROM nyc.taxis_sample.snapshots\").show(truncate=false)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f810ebb1",
   "metadata": {},
   "outputs": [],
   "source": [
    "!ls -Shs /home/iceberg/warehouse/nyc/taxis_sample/data"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c60631b7",
   "metadata": {},
   "source": [
    "## Removing Orphan Files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3dd93ad5",
   "metadata": {},
   "outputs": [],
   "source": [
    "spark.sql(s\"CALL demo.system.remove_orphan_files(table => 'nyc.taxis_sample', dry_run => true)\").show(truncate=false)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "94ad504b",
   "metadata": {},
   "outputs": [],
   "source": [
    "!touch -d \"2022-01-01 00:00:00.000000000\" /home/iceberg/warehouse/nyc/taxis_sample/data/i-shouldnt-be-here-1.parquet /home/iceberg/warehouse/nyc/taxis_sample/data/i-shouldnt-be-here-2.parquet /home/iceberg/warehouse/nyc/taxis_sample/data/i-shouldnt-be-here-3.parquet"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "226f74da",
   "metadata": {},
   "outputs": [],
   "source": [
    "!ls -Shs /home/iceberg/warehouse/nyc/taxis_sample/data/"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f22673aa",
   "metadata": {},
   "outputs": [],
   "source": [
    "spark.sql(s\"CALL demo.system.remove_orphan_files(table => 'nyc.taxis_sample', dry_run => true)\").show(truncate=false)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3444042b",
   "metadata": {},
   "outputs": [],
   "source": [
    "spark.sql(s\"CALL demo.system.remove_orphan_files(table => 'nyc.taxis_sample')\").show(truncate=false)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2d157881",
   "metadata": {},
   "outputs": [],
   "source": [
    "!ls -Shs /home/iceberg/warehouse/nyc/taxis_sample/data/"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "181212b6",
   "metadata": {},
   "source": [
    "## Rewriting Manifest Files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "49290e56",
   "metadata": {},
   "outputs": [],
   "source": [
    "spark.sql(\"CALL demo.system.rewrite_manifests('nyc.taxis_sample')\").show()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "spylon-kernel",
   "language": "scala",
   "name": "spylon-kernel"
  },
  "language_info": {
   "codemirror_mode": "text/x-scala",
   "file_extension": ".scala",
   "help_links": [
    {
     "text": "MetaKernel Magics",
     "url": "https://metakernel.readthedocs.io/en/latest/source/README.html"
    }
   ],
   "mimetype": "text/x-scala",
   "name": "scala",
   "pygments_lexer": "scala",
   "version": "0.4.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
